{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "087f6451",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "# import geopandas as gpd\n",
    "# import rioxarray as rxr\n",
    "# import rasterio \n",
    "import xarray as xr\n",
    "import earthpy as et\n",
    "from multiprocessing import Pool\n",
    "from parallel_xarray_Senegal import sample_tif\n",
    "from get_dynamic_by_date_Senegal import get_date_df\n",
    "import pandas as pd \n",
    "from shapely import geometry\n",
    "import datetime\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from dask import dataframe as dd\n",
    "import dask_ml\n",
    "import joblib\n",
    "from sklearn.utils import parallel_backend\n",
    "import pickle \n",
    "import sklearn"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "a6154cf1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px;\">Client</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-da3accbf-2a38-11ee-b0e4-ac1f6b12e433</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "\n",
       "        <tr>\n",
       "        \n",
       "            <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n",
       "            <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n",
       "        \n",
       "        </tr>\n",
       "\n",
       "        \n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\"></td>\n",
       "            </tr>\n",
       "        \n",
       "\n",
       "        </table>\n",
       "\n",
       "        \n",
       "\n",
       "        \n",
       "            <details>\n",
       "            <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n",
       "            <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n",
       "    <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n",
       "    </div>\n",
       "    <div style=\"margin-left: 48px;\">\n",
       "        <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n",
       "        <p style=\"color: #9D9D9D; margin-bottom: 0px;\">856643bb</p>\n",
       "        <table style=\"width: 100%; text-align: left;\">\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Workers:</strong> 6\n",
       "                </td>\n",
       "            </tr>\n",
       "            <tr>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total threads:</strong> 6\n",
       "                </td>\n",
       "                <td style=\"text-align: left;\">\n",
       "                    <strong>Total memory:</strong> 127.88 GiB\n",
       "                </td>\n",
       "            </tr>\n",
       "            \n",
       "            <tr>\n",
       "    <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n",
       "    <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n",
       "</tr>\n",
       "\n",
       "            \n",
       "        </table>\n",
       "\n",
       "        <details>\n",
       "            <summary style=\"margin-bottom: 20px;\">\n",
       "                <h3 style=\"display: inline;\">Scheduler Info</h3>\n",
       "            </summary>\n",
       "\n",
       "            <div style=\"\">\n",
       "    <div>\n",
       "        <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n",
       "        <div style=\"margin-left: 48px;\">\n",
       "            <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n",
       "            <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-d026d1df-e71b-4a93-92ad-ee2015a06a80</p>\n",
       "            <table style=\"width: 100%; text-align: left;\">\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Comm:</strong> tcp://127.0.0.1:60965\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Workers:</strong> 6\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total threads:</strong> 6\n",
       "                    </td>\n",
       "                </tr>\n",
       "                <tr>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Started:</strong> Just now\n",
       "                    </td>\n",
       "                    <td style=\"text-align: left;\">\n",
       "                        <strong>Total memory:</strong> 127.88 GiB\n",
       "                    </td>\n",
       "                </tr>\n",
       "            </table>\n",
       "        </div>\n",
       "    </div>\n",
       "\n",
       "    <details style=\"margin-left: 48px;\">\n",
       "        <summary style=\"margin-bottom: 20px;\">\n",
       "            <h3 style=\"display: inline;\">Workers</h3>\n",
       "        </summary>\n",
       "\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60997\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:61005/status\" target=\"_blank\">http://127.0.0.1:61005/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60968\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-jnh012pv\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60995\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:61006/status\" target=\"_blank\">http://127.0.0.1:61006/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60969\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-zw5eb_q7\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60996\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:61004/status\" target=\"_blank\">http://127.0.0.1:61004/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60970\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-k5u67tya\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60994\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:61002/status\" target=\"_blank\">http://127.0.0.1:61002/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60971\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-lz01dn8n\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60993\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:60999/status\" target=\"_blank\">http://127.0.0.1:60999/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60972\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-vf783itf\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "        <div style=\"margin-bottom: 20px;\">\n",
       "            <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n",
       "            <div style=\"margin-left: 48px;\">\n",
       "            <details>\n",
       "                <summary>\n",
       "                    <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n",
       "                </summary>\n",
       "                <table style=\"width: 100%; text-align: left;\">\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Comm: </strong> tcp://127.0.0.1:60992\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Total threads: </strong> 1\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:60998/status\" target=\"_blank\">http://127.0.0.1:60998/status</a>\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Memory: </strong> 21.31 GiB\n",
       "                        </td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td style=\"text-align: left;\">\n",
       "                            <strong>Nanny: </strong> tcp://127.0.0.1:60973\n",
       "                        </td>\n",
       "                        <td style=\"text-align: left;\"></td>\n",
       "                    </tr>\n",
       "                    <tr>\n",
       "                        <td colspan=\"2\" style=\"text-align: left;\">\n",
       "                            <strong>Local directory: </strong> C:\\Users\\colorado\\AppData\\Local\\Temp\\dask-worker-space\\worker-h05dwvy9\n",
       "                        </td>\n",
       "                    </tr>\n",
       "\n",
       "                    \n",
       "\n",
       "                    \n",
       "\n",
       "                </table>\n",
       "            </details>\n",
       "            </div>\n",
       "        </div>\n",
       "        \n",
       "\n",
       "    </details>\n",
       "</div>\n",
       "\n",
       "        </details>\n",
       "    </div>\n",
       "</div>\n",
       "            </details>\n",
       "        \n",
       "\n",
       "    </div>\n",
       "</div>"
      ],
      "text/plain": [
       "<Client: 'tcp://127.0.0.1:60965' processes=6 threads=6, memory=127.88 GiB>"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# for distributed Dask processes, instantiate a client \n",
    "from dask.distributed import Client\n",
    "client = Client(n_workers = 6, threads_per_worker = 1)\n",
    "client"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "efecc8cd",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Get all turkana files \n",
    "\n",
    "parquet_path = \"F:\\\\Senegal_Veg_Model\\\\parquet\"\n",
    "# get name function based on lulc \n",
    "crop_files = [os.path.join(parquet_path, file) for file in os.listdir(parquet_path) if 'crop' in file]\n",
    "# map to new parquet files \n",
    "crop = dd.read_parquet(crop_files)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "c3afeae9",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>date</th>\n",
       "      <th>lat</th>\n",
       "      <th>lon</th>\n",
       "      <th>EVI</th>\n",
       "      <th>16_day_sum_ppt_x</th>\n",
       "      <th>32_day_sum_ppt_x</th>\n",
       "      <th>48_day_sum_ppt_x</th>\n",
       "      <th>64_day_sum_ppt_x</th>\n",
       "      <th>16_day_mean_ppt</th>\n",
       "      <th>32_day_mean_ppt</th>\n",
       "      <th>...</th>\n",
       "      <th>year</th>\n",
       "      <th>month</th>\n",
       "      <th>16_day_sum_ppt_Z</th>\n",
       "      <th>32_day_sum_ppt_Z</th>\n",
       "      <th>48_day_sum_ppt_Z</th>\n",
       "      <th>64_day_sum_ppt_Z</th>\n",
       "      <th>16_day_mean_temp_Z</th>\n",
       "      <th>32_day_mean_temp_Z</th>\n",
       "      <th>48_day_mean_temp_Z</th>\n",
       "      <th>64_day_mean_temp_Z</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>2014-05-06</td>\n",
       "      <td>12.842591</td>\n",
       "      <td>-15.240212</td>\n",
       "      <td>0.475712</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000022</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.995613</td>\n",
       "      <td>-0.995621</td>\n",
       "      <td>-0.995644</td>\n",
       "      <td>-0.004785</td>\n",
       "      <td>-0.017956</td>\n",
       "      <td>-0.011308</td>\n",
       "      <td>-0.014911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2014-05-06</td>\n",
       "      <td>12.851350</td>\n",
       "      <td>-15.240212</td>\n",
       "      <td>0.614420</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000022</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.995613</td>\n",
       "      <td>-0.995621</td>\n",
       "      <td>-0.995644</td>\n",
       "      <td>-0.004785</td>\n",
       "      <td>-0.017956</td>\n",
       "      <td>-0.011308</td>\n",
       "      <td>-0.014911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2014-05-06</td>\n",
       "      <td>12.860108</td>\n",
       "      <td>-15.240212</td>\n",
       "      <td>0.567534</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.000694</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000022</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.995613</td>\n",
       "      <td>-0.995621</td>\n",
       "      <td>-0.995644</td>\n",
       "      <td>-0.004785</td>\n",
       "      <td>-0.017956</td>\n",
       "      <td>-0.011308</td>\n",
       "      <td>-0.014911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>2014-05-06</td>\n",
       "      <td>12.930160</td>\n",
       "      <td>-15.240212</td>\n",
       "      <td>0.611100</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000025</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.994445</td>\n",
       "      <td>-0.994468</td>\n",
       "      <td>-0.994560</td>\n",
       "      <td>-0.004785</td>\n",
       "      <td>-0.017956</td>\n",
       "      <td>-0.011308</td>\n",
       "      <td>-0.014911</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>2014-05-06</td>\n",
       "      <td>12.938916</td>\n",
       "      <td>-15.240212</td>\n",
       "      <td>0.617756</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.000791</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000025</td>\n",
       "      <td>...</td>\n",
       "      <td>2014</td>\n",
       "      <td>5</td>\n",
       "      <td>-1.0</td>\n",
       "      <td>-0.994445</td>\n",
       "      <td>-0.994468</td>\n",
       "      <td>-0.994560</td>\n",
       "      <td>-0.004785</td>\n",
       "      <td>-0.017956</td>\n",
       "      <td>-0.011308</td>\n",
       "      <td>-0.014911</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 57 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        date        lat        lon       EVI  16_day_sum_ppt_x  \\\n",
       "0 2014-05-06  12.842591 -15.240212  0.475712               0.0   \n",
       "1 2014-05-06  12.851350 -15.240212  0.614420               0.0   \n",
       "2 2014-05-06  12.860108 -15.240212  0.567534               0.0   \n",
       "3 2014-05-06  12.930160 -15.240212  0.611100               0.0   \n",
       "4 2014-05-06  12.938916 -15.240212  0.617756               0.0   \n",
       "\n",
       "   32_day_sum_ppt_x  48_day_sum_ppt_x  64_day_sum_ppt_x  16_day_mean_ppt  \\\n",
       "0          0.000694          0.000694          0.000694              0.0   \n",
       "1          0.000694          0.000694          0.000694              0.0   \n",
       "2          0.000694          0.000694          0.000694              0.0   \n",
       "3          0.000791          0.000791          0.000791              0.0   \n",
       "4          0.000791          0.000791          0.000791              0.0   \n",
       "\n",
       "   32_day_mean_ppt  ...  year  month  16_day_sum_ppt_Z  32_day_sum_ppt_Z  \\\n",
       "0         0.000022  ...  2014      5              -1.0         -0.995613   \n",
       "1         0.000022  ...  2014      5              -1.0         -0.995613   \n",
       "2         0.000022  ...  2014      5              -1.0         -0.995613   \n",
       "3         0.000025  ...  2014      5              -1.0         -0.994445   \n",
       "4         0.000025  ...  2014      5              -1.0         -0.994445   \n",
       "\n",
       "   48_day_sum_ppt_Z  64_day_sum_ppt_Z  16_day_mean_temp_Z  32_day_mean_temp_Z  \\\n",
       "0         -0.995621         -0.995644           -0.004785           -0.017956   \n",
       "1         -0.995621         -0.995644           -0.004785           -0.017956   \n",
       "2         -0.995621         -0.995644           -0.004785           -0.017956   \n",
       "3         -0.994468         -0.994560           -0.004785           -0.017956   \n",
       "4         -0.994468         -0.994560           -0.004785           -0.017956   \n",
       "\n",
       "   48_day_mean_temp_Z  64_day_mean_temp_Z  \n",
       "0           -0.011308           -0.014911  \n",
       "1           -0.011308           -0.014911  \n",
       "2           -0.011308           -0.014911  \n",
       "3           -0.011308           -0.014911  \n",
       "4           -0.011308           -0.014911  \n",
       "\n",
       "[5 rows x 57 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# take a look at the data\n",
    "crop.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "e6dac155",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['date', 'lat', 'lon', 'EVI', '16_day_sum_ppt_x', '32_day_sum_ppt_x',\n",
       "       '48_day_sum_ppt_x', '64_day_sum_ppt_x', '16_day_mean_ppt',\n",
       "       '32_day_mean_ppt', '48_day_mean_ppt', '64_day_mean_ppt',\n",
       "       '16_day_stdv_ppt', '32_day_stdv_ppt', '48_day_stdv_ppt',\n",
       "       '64_day_stdv_ppt', '16_day_mean_temp_x', '32_day_mean_temp_x',\n",
       "       '48_day_mean_temp_x', '64_day_mean_temp_x', '16_day_stdv_temp',\n",
       "       '32_day_stdv_temp', '48_day_stdv_temp', '64_day_stdv_temp',\n",
       "       'Unnamed: 0.1', 'FID', 'lulc', 'slope', 'elevation',\n",
       "       'bdod0_100cm_inv_weight_mean', 'cfvo0_100cm_inv_weight_mean',\n",
       "       'clay0_100cm_inv_weight_mean', 'sand0_100cm_inv_weight_mean',\n",
       "       'silt0_100cm_inv_weight_mean', 'soc0_100cm_inv_weight_mean',\n",
       "       'bdod0_30cm_weight_mean', 'cfvo0_30cm_weight_mean',\n",
       "       'clay0_30cm_weight_mean', 'sand0_30cm_weight_mean',\n",
       "       'silt0_30cm_weight_mean', 'soc0_30cm_weight_mean',\n",
       "       'bdod30_100cm_weight_mean', 'cfvo30_100cm_weight_mean',\n",
       "       'clay30_100cm_weight_mean', 'sand30_100cm_weight_mean',\n",
       "       'silt30_100cm_weight_mean', 'soc30_100cm_weight_mean', 'year', 'month',\n",
       "       '16_day_sum_ppt_Z', '32_day_sum_ppt_Z', '48_day_sum_ppt_Z',\n",
       "       '64_day_sum_ppt_Z', '16_day_mean_temp_Z', '32_day_mean_temp_Z',\n",
       "       '48_day_mean_temp_Z', '64_day_mean_temp_Z'],\n",
       "      dtype='object')"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# look at the columns \n",
    "crop.columns"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "30185de2",
   "metadata": {},
   "source": [
    "### Random Forests - Explained Variance Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "eebfa59e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Helpers for the random forest explained-variance analysis:\n",
    "#   ddTrain_Test_Split_Scale -> train/test split of a dask dataframe\n",
    "#   ddRandom_Forest          -> fit and score a RandomForestRegressor on that split\n",
    "\n",
    "def ddTrain_Test_Split_Scale(df, X_vars, y_var, test_size=0.2, random_state=None):\n",
    "    \"\"\"Split a dask dataframe into float32 train/test predictors and target.\n",
    "\n",
    "    Rows holding NaN or +/-inf in any column of `df` (not only the modelled ones)\n",
    "    are dropped before the split. Despite the function name, no scaling is applied:\n",
    "    the StandardScaler step that used to follow the split has been removed.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : dask dataframe containing the predictor and target columns\n",
    "    X_vars : list of predictor column names\n",
    "    y_var : name of the target column\n",
    "    test_size : fraction of rows held out for testing (default 0.2)\n",
    "    random_state : optional seed forwarded to dask_ml's train_test_split so the\n",
    "        split can be reproduced (default None = unseeded, as before)\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (X_train, X_test, y_train, y_test), each cast to float32\n",
    "    \"\"\"\n",
    "    # imported locally: the notebook's import cell binds `train_test_split` to the sklearn version\n",
    "    from dask_ml.model_selection import train_test_split\n",
    "\n",
    "    # flag rows with at least one NaN / inf entry; axis is passed by keyword because\n",
    "    # newer pandas no longer accepts it positionally\n",
    "    has_bad_value = df.isin([np.nan, np.inf, -np.inf]).any(axis=1)\n",
    "    clean_df = df[~has_bad_value]\n",
    "\n",
    "    X = clean_df[X_vars]\n",
    "    y = clean_df[y_var]\n",
    "\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=test_size, shuffle=True, random_state=random_state\n",
    "    )\n",
    "    return (X_train.astype('float32'), X_test.astype('float32'),\n",
    "            y_train.astype('float32'), y_test.astype('float32'))\n",
    "\n",
    "\n",
    "def ddRandom_Forest(scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test, **kwargs):\n",
    "    \"\"\"Fit a RandomForestRegressor on the training split and score it on the test split.\n",
    "\n",
    "    Extra keyword arguments (e.g. n_estimators, max_depth, random_state) are forwarded\n",
    "    to sklearn.ensemble.RandomForestRegressor. Fitting, prediction and scoring run inside\n",
    "    joblib's \"dask\" parallel backend. The `scaled_` prefixes are only names: the inputs\n",
    "    are used exactly as passed in.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    (model, predictions, r2): the fitted regressor, its predictions for scaled_X_test,\n",
    "    and the R^2 of those predictions against scaled_y_test.\n",
    "    \"\"\"\n",
    "    import joblib\n",
    "    from sklearn.ensemble import RandomForestRegressor\n",
    "    from sklearn.metrics import r2_score\n",
    "\n",
    "    model = RandomForestRegressor(bootstrap=True, verbose=1, **kwargs)\n",
    "\n",
    "    # run the forest's joblib work on the dask backend\n",
    "    with joblib.parallel_backend(\"dask\"):\n",
    "        model.fit(scaled_X_train, scaled_y_train)\n",
    "        predictions = model.predict(scaled_X_test)\n",
    "        r2 = r2_score(scaled_y_test, predictions)\n",
    "\n",
    "    return (model, predictions, r2)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a1298331",
   "metadata": {},
   "source": [
    "### Full regression trees "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "ad3453ee",
   "metadata": {},
   "outputs": [],
   "source": [
    "# EVI quantile cutoffs: the 1st / 99th percentiles trim the tails, Q1 / Q3 separate the subsets.\n",
    "# The EVI column is materialised once and all four quantiles are taken in a single call\n",
    "# instead of calling .compute() separately for each cutoff.\n",
    "evi_values = crop['EVI'].compute()\n",
    "min_EVI, Q1, Q3, max_EVI = evi_values.quantile([.01, .25, .75, .99])\n",
    "del evi_values  # only the four cutoffs are needed from here on\n",
    "\n",
    "# filter out any points where soil data didn't come through (sand column equal to 0)\n",
    "has_soil = crop['sand0_100cm_inv_weight_mean'] != 0\n",
    "\n",
    "# whole dataset with the EVI tails trimmed\n",
    "full_df = crop[(crop['EVI'] <= max_EVI) & (crop['EVI'] >= min_EVI) & has_soil]\n",
    "\n",
    "# non-overlapping subsets: low EVI (<= Q1), interquartile (strictly between), high EVI (>= Q3)\n",
    "q1_df = crop[(crop['EVI'] <= Q1) & (crop['EVI'] >= min_EVI) & has_soil]\n",
    "iq_df = crop[(crop['EVI'] > Q1) & (crop['EVI'] < Q3) & has_soil]\n",
    "q3_df = crop[(crop['EVI'] >= Q3) & (crop['EVI'] <= max_EVI) & has_soil]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "12a1c96f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  2.5min finished\n",
      "[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.4s finished\n"
     ]
    }
   ],
   "source": [
    "# Random forest for the first-quartile EVI subset (q1_df); rows with missing values are removed first.\n",
    "df = q1_df.dropna()\n",
    "\n",
    "# Predictors. The precipitation means are left out on purpose: they are scalar multiples\n",
    "# of the precipitation sums, so only one of the two families is used.\n",
    "# Suffix _x = regular measurement, suffix _Z = z score of that measurement.\n",
    "X_vars = [\n",
    "    # precipitation sums and standard deviations\n",
    "    '16_day_sum_ppt_x', '32_day_sum_ppt_x', '48_day_sum_ppt_x', '64_day_sum_ppt_x',\n",
    "    '16_day_stdv_ppt', '32_day_stdv_ppt', '48_day_stdv_ppt', '64_day_stdv_ppt',\n",
    "    # temperature means and standard deviations\n",
    "    '16_day_mean_temp_x', '32_day_mean_temp_x', '48_day_mean_temp_x', '64_day_mean_temp_x',\n",
    "    '16_day_stdv_temp', '32_day_stdv_temp', '48_day_stdv_temp', '64_day_stdv_temp',\n",
    "    # terrain\n",
    "    'slope', 'elevation',\n",
    "    # soil columns, 0_30cm group\n",
    "    'bdod0_30cm_weight_mean', 'cfvo0_30cm_weight_mean', 'clay0_30cm_weight_mean',\n",
    "    'sand0_30cm_weight_mean', 'silt0_30cm_weight_mean', 'soc0_30cm_weight_mean',\n",
    "    # soil columns, 30_100cm group\n",
    "    'bdod30_100cm_weight_mean', 'cfvo30_100cm_weight_mean', 'clay30_100cm_weight_mean',\n",
    "    'sand30_100cm_weight_mean', 'silt30_100cm_weight_mean', 'soc30_100cm_weight_mean',\n",
    "    # calendar\n",
    "    'year', 'month',\n",
    "    # z scores of the precipitation sums and temperature means\n",
    "    '16_day_sum_ppt_Z', '32_day_sum_ppt_Z', '48_day_sum_ppt_Z', '64_day_sum_ppt_Z',\n",
    "    '16_day_mean_temp_Z', '32_day_mean_temp_Z', '48_day_mean_temp_Z', '64_day_mean_temp_Z',\n",
    "]\n",
    "y_var = 'EVI'\n",
    "\n",
    "# train / test split\n",
    "scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test = ddTrain_Test_Split_Scale(\n",
    "    df, X_vars, y_var\n",
    ")\n",
    "\n",
    "# fit the forest and keep the model, its test predictions and the test R^2\n",
    "Q1_model, Q1_predictions, Q1_r2 = ddRandom_Forest(\n",
    "    scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test,\n",
    "    n_estimators=20, max_depth=30,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "id": "a8cbb92a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.47909177753402354\n"
     ]
    }
   ],
   "source": [
    "# test-set R^2 of the first-quartile model\n",
    "print(Q1_r2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "id": "9d21f495",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the first-quartile model and its R^2 value so they can be reloaded later.\n",
    "# (pickle comes from the notebook's import cell)\n",
    "pckl = \"Q1_model.pkl\"\n",
    "for artifact, target in ((Q1_model, pckl), (Q1_r2, \"Q1_r2.pkl\")):\n",
    "    with open(target, 'wb') as f:\n",
    "        pickle.dump(artifact, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "id": "2b7deea1",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  4.6min finished\n",
      "[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.7s finished\n"
     ]
    }
   ],
   "source": [
    "# Random forest for the interquartile EVI subset (iq_df); rows with missing values are removed first.\n",
    "df = iq_df.dropna()\n",
    "\n",
    "# Predictors. The precipitation means are left out on purpose: they are scalar multiples\n",
    "# of the precipitation sums, so only one of the two families is used.\n",
    "# Suffix _x = regular measurement, suffix _Z = z score of that measurement.\n",
    "X_vars = [\n",
    "    # precipitation sums and standard deviations\n",
    "    '16_day_sum_ppt_x', '32_day_sum_ppt_x', '48_day_sum_ppt_x', '64_day_sum_ppt_x',\n",
    "    '16_day_stdv_ppt', '32_day_stdv_ppt', '48_day_stdv_ppt', '64_day_stdv_ppt',\n",
    "    # temperature means and standard deviations\n",
    "    '16_day_mean_temp_x', '32_day_mean_temp_x', '48_day_mean_temp_x', '64_day_mean_temp_x',\n",
    "    '16_day_stdv_temp', '32_day_stdv_temp', '48_day_stdv_temp', '64_day_stdv_temp',\n",
    "    # terrain\n",
    "    'slope', 'elevation',\n",
    "    # soil columns, 0_30cm group\n",
    "    'bdod0_30cm_weight_mean', 'cfvo0_30cm_weight_mean', 'clay0_30cm_weight_mean',\n",
    "    'sand0_30cm_weight_mean', 'silt0_30cm_weight_mean', 'soc0_30cm_weight_mean',\n",
    "    # soil columns, 30_100cm group\n",
    "    'bdod30_100cm_weight_mean', 'cfvo30_100cm_weight_mean', 'clay30_100cm_weight_mean',\n",
    "    'sand30_100cm_weight_mean', 'silt30_100cm_weight_mean', 'soc30_100cm_weight_mean',\n",
    "    # calendar\n",
    "    'year', 'month',\n",
    "    # z scores of the precipitation sums and temperature means\n",
    "    '16_day_sum_ppt_Z', '32_day_sum_ppt_Z', '48_day_sum_ppt_Z', '64_day_sum_ppt_Z',\n",
    "    '16_day_mean_temp_Z', '32_day_mean_temp_Z', '48_day_mean_temp_Z', '64_day_mean_temp_Z',\n",
    "]\n",
    "y_var = 'EVI'\n",
    "\n",
    "# train / test split\n",
    "scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test = ddTrain_Test_Split_Scale(\n",
    "    df, X_vars, y_var\n",
    ")\n",
    "\n",
    "# fit the forest and keep the model, its test predictions and the test R^2\n",
    "IQ_model, IQ_predictions, IQ_r2 = ddRandom_Forest(\n",
    "    scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test,\n",
    "    n_estimators=20, max_depth=30,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "8a6d336e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the interquartile model and its R^2 value so they can be reloaded later.\n",
    "# (pickle comes from the notebook's import cell)\n",
    "pckl = \"IQ_model.pkl\"\n",
    "for artifact, target in ((IQ_model, pckl), (IQ_r2, \"IQ_r2.pkl\")):\n",
    "    with open(target, 'wb') as f:\n",
    "        pickle.dump(artifact, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "id": "a55f423d",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:  3.1min finished\n",
      "[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.4s finished\n"
     ]
    }
   ],
   "source": [
    "# Random forest for the third-quartile EVI subset (q3_df); rows with missing values are removed first.\n",
    "df = q3_df.dropna()\n",
    "\n",
    "# Predictors. The precipitation means are left out on purpose: they are scalar multiples\n",
    "# of the precipitation sums, so only one of the two families is used.\n",
    "# Suffix _x = regular measurement, suffix _Z = z score of that measurement.\n",
    "X_vars = [\n",
    "    # precipitation sums and standard deviations\n",
    "    '16_day_sum_ppt_x', '32_day_sum_ppt_x', '48_day_sum_ppt_x', '64_day_sum_ppt_x',\n",
    "    '16_day_stdv_ppt', '32_day_stdv_ppt', '48_day_stdv_ppt', '64_day_stdv_ppt',\n",
    "    # temperature means and standard deviations\n",
    "    '16_day_mean_temp_x', '32_day_mean_temp_x', '48_day_mean_temp_x', '64_day_mean_temp_x',\n",
    "    '16_day_stdv_temp', '32_day_stdv_temp', '48_day_stdv_temp', '64_day_stdv_temp',\n",
    "    # terrain\n",
    "    'slope', 'elevation',\n",
    "    # soil columns, 0_30cm group\n",
    "    'bdod0_30cm_weight_mean', 'cfvo0_30cm_weight_mean', 'clay0_30cm_weight_mean',\n",
    "    'sand0_30cm_weight_mean', 'silt0_30cm_weight_mean', 'soc0_30cm_weight_mean',\n",
    "    # soil columns, 30_100cm group\n",
    "    'bdod30_100cm_weight_mean', 'cfvo30_100cm_weight_mean', 'clay30_100cm_weight_mean',\n",
    "    'sand30_100cm_weight_mean', 'silt30_100cm_weight_mean', 'soc30_100cm_weight_mean',\n",
    "    # calendar\n",
    "    'year', 'month',\n",
    "    # z scores of the precipitation sums and temperature means\n",
    "    '16_day_sum_ppt_Z', '32_day_sum_ppt_Z', '48_day_sum_ppt_Z', '64_day_sum_ppt_Z',\n",
    "    '16_day_mean_temp_Z', '32_day_mean_temp_Z', '48_day_mean_temp_Z', '64_day_mean_temp_Z',\n",
    "]\n",
    "y_var = 'EVI'\n",
    "\n",
    "# train / test split\n",
    "scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test = ddTrain_Test_Split_Scale(\n",
    "    df, X_vars, y_var\n",
    ")\n",
    "\n",
    "# fit the forest and keep the model, its test predictions and the test R^2\n",
    "Q3_model, Q3_predictions, Q3_r2 = ddRandom_Forest(\n",
    "    scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test,\n",
    "    n_estimators=20, max_depth=30,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "id": "bcb081d5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the third-quartile model and its R^2 value so they can be reloaded later.\n",
    "# (pickle comes from the notebook's import cell)\n",
    "pckl = \"Q3_model.pkl\"\n",
    "for artifact, target in ((Q3_model, pckl), (Q3_r2, \"Q3_r2.pkl\")):\n",
    "    with open(target, 'wb') as f:\n",
    "        pickle.dump(artifact, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "id": "61d49d40",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Using backend DaskDistributedBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 10.6min finished\n",
      "[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.\n",
      "[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    1.6s finished\n"
     ]
    }
   ],
   "source": [
    "# Random forest for the full EVI range (full_df, not a single quartile); rows with missing values are removed first.\n",
    "df = full_df.dropna()\n",
    "\n",
    "# Predictors. The precipitation means are left out on purpose: they are scalar multiples\n",
    "# of the precipitation sums, so only one of the two families is used.\n",
    "# Suffix _x = regular measurement, suffix _Z = z score of that measurement.\n",
    "X_vars = [\n",
    "    # precipitation sums and standard deviations\n",
    "    '16_day_sum_ppt_x', '32_day_sum_ppt_x', '48_day_sum_ppt_x', '64_day_sum_ppt_x',\n",
    "    '16_day_stdv_ppt', '32_day_stdv_ppt', '48_day_stdv_ppt', '64_day_stdv_ppt',\n",
    "    # temperature means and standard deviations\n",
    "    '16_day_mean_temp_x', '32_day_mean_temp_x', '48_day_mean_temp_x', '64_day_mean_temp_x',\n",
    "    '16_day_stdv_temp', '32_day_stdv_temp', '48_day_stdv_temp', '64_day_stdv_temp',\n",
    "    # terrain\n",
    "    'slope', 'elevation',\n",
    "    # soil columns, 0_30cm group\n",
    "    'bdod0_30cm_weight_mean', 'cfvo0_30cm_weight_mean', 'clay0_30cm_weight_mean',\n",
    "    'sand0_30cm_weight_mean', 'silt0_30cm_weight_mean', 'soc0_30cm_weight_mean',\n",
    "    # soil columns, 30_100cm group\n",
    "    'bdod30_100cm_weight_mean', 'cfvo30_100cm_weight_mean', 'clay30_100cm_weight_mean',\n",
    "    'sand30_100cm_weight_mean', 'silt30_100cm_weight_mean', 'soc30_100cm_weight_mean',\n",
    "    # calendar\n",
    "    'year', 'month',\n",
    "    # z scores of the precipitation sums and temperature means\n",
    "    '16_day_sum_ppt_Z', '32_day_sum_ppt_Z', '48_day_sum_ppt_Z', '64_day_sum_ppt_Z',\n",
    "    '16_day_mean_temp_Z', '32_day_mean_temp_Z', '48_day_mean_temp_Z', '64_day_mean_temp_Z',\n",
    "]\n",
    "y_var = 'EVI'\n",
    "\n",
    "# train / test split\n",
    "scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test = ddTrain_Test_Split_Scale(\n",
    "    df, X_vars, y_var\n",
    ")\n",
    "\n",
    "# fit the forest and keep the model, its test predictions and the test R^2\n",
    "full_model, full_predictions, full_r2 = ddRandom_Forest(\n",
    "    scaled_X_train, scaled_X_test, scaled_y_train, scaled_y_test,\n",
    "    n_estimators=20, max_depth=30,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "id": "0a71d44a",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist the full-range model and its R^2 value so they can be reloaded later.\n",
    "# (pickle comes from the notebook's import cell)\n",
    "pckl = \"full_model.pkl\"\n",
    "for artifact, target in ((full_model, pckl), (full_r2, \"full_r2.pkl\")):\n",
    "    with open(target, 'wb') as f:\n",
    "        pickle.dump(artifact, f)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "id": "2fb37cbe",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6051484679514725"
      ]
     },
     "execution_count": 55,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "IQ_r2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "03a759ad",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
